{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np \n",
    "import pandas as pd "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = '../data_sources/echp/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_p = pd.read_stata(data_path+'a_w'+str(1)+'p.dta',convert_categoricals=False)\n",
    "df_p['pid'] = df_p['pid'].astype('int64')\n",
    "df_p.set_index(['country','hid','pid'],inplace=True)\n",
    "df_p = df_p.sort_index()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_r = pd.read_stata(data_path+'a_w'+str(1)+'rel.dta')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_h = pd.read_stata(data_path+'a_w'+str(1)+'h.dta',convert_categoricals=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>wave</th>\n",
       "      <th>pg001</th>\n",
       "      <th>pg002</th>\n",
       "      <th>pg003</th>\n",
       "      <th>pg004</th>\n",
       "      <th>pg005</th>\n",
       "      <th>pg006</th>\n",
       "      <th>pg007</th>\n",
       "      <th>pg008</th>\n",
       "      <th>pd001</th>\n",
       "      <th>...</th>\n",
       "      <th>pm007a</th>\n",
       "      <th>pm007b</th>\n",
       "      <th>pm007c</th>\n",
       "      <th>pm008</th>\n",
       "      <th>pm010</th>\n",
       "      <th>pm011</th>\n",
       "      <th>pk001</th>\n",
       "      <th>pk002</th>\n",
       "      <th>pk003</th>\n",
       "      <th>pk004</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th>hid</th>\n",
       "      <th>pid</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"5\" valign=\"top\">1</th>\n",
       "      <th rowspan=\"2\" valign=\"top\">101</th>\n",
       "      <th>1101</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.08041</td>\n",
       "      <td>1.08041</td>\n",
       "      <td>25</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>1</td>\n",
       "      <td>1962</td>\n",
       "      <td>...</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>1</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1102</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.08041</td>\n",
       "      <td>1.08041</td>\n",
       "      <td>25</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>1</td>\n",
       "      <td>1964</td>\n",
       "      <td>...</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>1</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">201</th>\n",
       "      <th>2102</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1.35196</td>\n",
       "      <td>1.35196</td>\n",
       "      <td>15</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>1</td>\n",
       "      <td>1936</td>\n",
       "      <td>...</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>1</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2103</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1.35196</td>\n",
       "      <td>1.35196</td>\n",
       "      <td>-9</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>3</td>\n",
       "      <td>1963</td>\n",
       "      <td>...</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>1</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2104</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1.35196</td>\n",
       "      <td>1.35196</td>\n",
       "      <td>10</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>3</td>\n",
       "      <td>1967</td>\n",
       "      <td>...</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>1</td>\n",
       "      <td>-8</td>\n",
       "      <td>-8</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 224 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  wave  pg001    pg002    pg003  pg004  pg005  pg006  pg007  \\\n",
       "country hid pid                                                               \n",
       "1       101 1101     1      1  1.08041  1.08041     25     -8     -8     -8   \n",
       "            1102     1      1  1.08041  1.08041     25     -8     -8     -8   \n",
       "        201 2102     1      2  1.35196  1.35196     15     -8     -8     -8   \n",
       "            2103     1      2  1.35196  1.35196     -9     -8     -8     -8   \n",
       "            2104     1      2  1.35196  1.35196     10     -8     -8     -8   \n",
       "\n",
       "                  pg008  pd001  ...  pm007a  pm007b  pm007c  pm008  pm010  \\\n",
       "country hid pid                 ...                                         \n",
       "1       101 1101      1   1962  ...      -8      -8      -8      1     -8   \n",
       "            1102      1   1964  ...      -8      -8      -8      1     -8   \n",
       "        201 2102      1   1936  ...      -8      -8      -8      1     -8   \n",
       "            2103      3   1963  ...      -8      -8      -8      1     -8   \n",
       "            2104      3   1967  ...      -8      -8      -8      1     -8   \n",
       "\n",
       "                  pm011  pk001  pk002  pk003  pk004  \n",
       "country hid pid                                      \n",
       "1       101 1101     -8      4      4      5      5  \n",
       "            1102     -8      5      1      6      4  \n",
       "        201 2102     -8      3      4      5      5  \n",
       "            2103     -8      3      2      3      2  \n",
       "            2104     -8      5      3      4      2  \n",
       "\n",
       "[5 rows x 224 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_p.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "waves = [x for x in range(1,9)]\n",
    "df_s = []\n",
    "pvars = ['wave','hid','pd001','pd003','pd004','pd005','pi100','pi110','pi211m','pi211mg', 'pt022','country','pe001','pe032','pg002','pid']\n",
    "for w in waves:\n",
    "\t# person file\n",
    "\tdf_p = pd.read_stata(data_path+'a_w'+str(w)+'p.dta',convert_categoricals=False)\n",
    "\tdf_p = df_p[pvars]\n",
    "\tdf_p['pid'] = df_p['pid'].astype('int64')\n",
    "\tdf_p = df_p.drop_duplicates(subset=['country','hid','pid'],keep='last')\n",
    "\tdf_p.set_index(['country','hid','pid'],inplace=True)\n",
    "\tdf_p = df_p.sort_index()\t\n",
    "\t# relations\n",
    "\tdf_r = pd.read_stata(data_path+'a_w'+str(w)+'rel.dta',convert_categoricals=False)\n",
    "\tdf_r = df_r.rename({'pid1':'pid'},axis=1)\n",
    "\tdf_r['pid'] = df_r['pid'].astype('int64')\n",
    "\tdf_r = df_r[['country','hid','pid','relation']]\n",
    "\tdf_r = df_r[df_r['relation'].isin([0,1])]\n",
    "\tdf_r = df_r.drop_duplicates(subset=['country','hid','pid'],keep='last')\n",
    "\tdf_r.set_index(['country','hid','pid'],inplace=True)\n",
    "\tdf = df_r.merge(df_p,left_index=True,right_index=True,how='left')\n",
    "\t# household file\n",
    "\tdf_h = pd.read_stata(data_path+'a_w'+str(w)+'h.dta',convert_categoricals=False)\n",
    "\tdf_h = df_h[['hid','country','hi100']]\n",
    "\tdf_h.set_index(['hid','country'],inplace=True)\n",
    "\tdf_h = df_h.sort_index()\n",
    "\t# merging \n",
    "\tdf_m = df.reset_index().merge(df_h,left_on=['hid','country'],right_on=['hid','country'],how='left')\n",
    "\tdf_m = df_m.reset_index()\n",
    "\tdf_m.set_index(['country','hid','pid'],inplace=True)\n",
    "\tdf_s.append(df_m)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df_s[0]\n",
    "for d in df_s[1:]:\n",
    "\tdf = df.append(d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "497001"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Age"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.rename({'pd003':'age'},axis=1)\n",
    "df = df[df['age']>=0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Income $Y$. We use total net (after tax) household income. \n",
    "\n",
    "HI100 TOTAL NET HOUSEHOLD INCOME (DETAILED, NC, TOTAL YEAR PRIOR TO THE SURVEY)\n",
    "\n",
    "In Italy, currency in liras, but in thousands"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.rename({'hi100':'Y'},axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Drop countries we do not need. Sweden missing because NA. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[df.index.get_level_values(0).isin([51,2,3,6,9,11])]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Map names of countries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "co_map = {51:'DE',2:'DK',3:'NL',6:'FR',9:'IT',11:'SP'}\n",
    "df['co'] = df.index.get_level_values(0).to_list()\n",
    "df['co'] = df['co'].replace(co_map)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Only keep dominants in households"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[df['relation']==0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a year variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['year'] = df['wave'].replace({1:1994,2:1995,3:1996,4:1997,5:1998,6:1999,7:2000,8:2021})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Keep variables of interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[['wave','year','age','co','Y']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will use mostly logs to estimate covariance structures. Therefore, we will normalize incomes using means. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[df['Y']>0]\n",
    "for c in ['DE','DK','NL','FR','IT','SP']:\n",
    "    df.loc[df.co==c,'Y'] = df.loc[df.co==c,'Y']/df.loc[df.co==c,'Y'].mean()\n",
    "    up = df.loc[df.co==c,'Y'].quantile(0.99)\n",
    "    low = df.loc[df.co==c,'Y'].quantile(0.01)\n",
    "    df.loc[df.co==c,'Y'] = df.loc[df.co==c,'Y'].clip(low,up)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['logY'] = np.where((df['Y'].isna()==False),np.log(df['Y']),np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['Y'].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Simple means by country and year. Still in national currencies. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['year','co']).mean()['Y'].unstack()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A look at what the dataframe looks like"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Saving the output file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_pickle(data_path+'echp_incomes.pkl')\n"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "cf2a50979671a58939829e6829efb726aa5da11149213b77bd50351f899d04fb"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
